import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import time
from collections import namedtuple
from scipy import stats
%matplotlib inline
np.set_printoptions(precision=3, suppress=1)
class Hallway(object):
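    """Room-based gridworld with walls, doors, a key and rewarding goal cells.

    Bumping into a wall costs -1; a door gives +5 and can only be crossed when the key
    is in the agent's inventory (otherwise -2 and the agent stays put); moving onto an
    empty cell yields a reward shaped by the distance to the nearest goal; reaching the
    key or a goal ends the episode (discount 0) and resets the agent to the start state.
    """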
def __init__(self, goal_loc, tabular=True, vision_size=1, discount=0.98, noisy=False):
# 10: Key
# -2: Door
# -1: wall
# 0: empty, episode continues
# other: number indicates reward, episode will terminate
self._wall = -1
self._door = -2
self._key = 10
# self._layout = np.array([
# [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
# [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
# [-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1],
# [-1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1],
# [-1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1],
# [-1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1],
# [-1, -1, 0, -2, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1],
# [-1, -1, 0, -2, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1],
# [-1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1],
# [-1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1],
# [-1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1],
# [-1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1],
# [-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1],
# [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
# [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
# ])
# ROOMS
self._layout = np.array([
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
[-1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1],
[-1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1],
[-1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1],
[-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1],
[-1, -1, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, -1, -1],
[-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -1, -1],
[-1, 0, 0, 0, -1, -1, -1, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1],
[-1, 0, 0, 0, -1, -1, -1, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1],
[-1, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1],
[-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
])
self._goals = set()
self._goal_loc = []
for e in goal_loc:
self._layout[e[0],e[1]] = e[2]
self._goal_loc.append((e[0],e[1]))
self._goals.add(e[2])
#self._goal = value
# row, col format
# self._start_state = (12, 8)
self._start_state = (11, 8)
self._state = self._start_state
self._number_of_states = np.prod(np.shape(self._layout))
self._noisy = noisy
self._tabular = tabular
self._vision_size = vision_size
self._discount = discount
self._distanceToGoal = None
#self.distanceToGoal()
def resetState(self):
self._state = self._start_state
def distanceToGoal(self):
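        # Despite its name, this returns a per-rollout step budget: ten times the number of non-wall cells.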
return np.count_nonzero(self._layout != -1)*10
def maxDistanceTwoPoints(self, cy, cx, dy, dx):
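        # Depth-first traversal from (cy, cx): records the path length at which (dy, dx)
        # is reached and returns the largest recorded value. It is only used to normalise
        # the distance-shaped reward in step(), so an exact shortest path is not required.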
distances = []
stack = []
visited = set()
stack.append((cy, cx, 0))
while len(stack) != 0:
#print len(stack)
cur_row, cur_col, dist = stack.pop()
if (cur_row, cur_col) in visited:
continue
visited.add((cur_row, cur_col))
if (cur_row, cur_col) == (dy, dx):
distances.append(dist)
if cur_row+1 < self._layout.shape[0] and self._layout[cur_row+1, cur_col] != self._wall:
stack.append((cur_row+1, cur_col, dist+1))
if cur_row-1 > -1 and self._layout[cur_row-1, cur_col] != self._wall:
stack.append((cur_row-1, cur_col, dist+1))
if cur_col+1 < self._layout.shape[1] and self._layout[cur_row, cur_col+1] != self._wall:
stack.append((cur_row, cur_col+1, dist+1))
if cur_col-1 > -1 and self._layout[cur_row, cur_col-1] != self._wall:
stack.append((cur_row, cur_col-1, dist+1))
return np.max(np.array(distances))
def distanceToNearestGoal(self, new_y, new_x):
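        # Depth-first traversal from (new_y, new_x): collects the traversal path length at
        # which each goal cell is reached and returns the smallest one together with its
        # location. Note the search is depth-first, so these are path lengths along the
        # traversal order rather than true shortest-path distances.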
distances = []
locations = []
stack = []
visited = set()
stack.append((new_y, new_x, 0))
while len(stack) != 0:
#print len(stack)
cur_row, cur_col, dist = stack.pop()
if (cur_row, cur_col) in visited:
continue
visited.add((cur_row, cur_col))
for e in self._goal_loc:
if (cur_row, cur_col) == e:
distances.append(dist)
locations.append(e)
if cur_row+1 < self._layout.shape[0] and self._layout[cur_row+1, cur_col] != self._wall:
stack.append((cur_row+1, cur_col, dist+1))
if cur_row-1 > -1 and self._layout[cur_row-1, cur_col] != self._wall:
stack.append((cur_row-1, cur_col, dist+1))
if cur_col+1 < self._layout.shape[1] and self._layout[cur_row, cur_col+1] != self._wall:
stack.append((cur_row, cur_col+1, dist+1))
if cur_col-1 > -1 and self._layout[cur_row, cur_col-1] != self._wall:
stack.append((cur_row, cur_col-1, dist+1))
        # Has to be the absolute closest goal
argmin_dist = np.argmin(np.array(distances))
return distances[argmin_dist], locations[argmin_dist]
def handleDoor(self):
pass
@property
def number_of_states(self):
return self._number_of_states
def plot_grid(self, title=None):
plt.figure(figsize=(4, 4))
plt.imshow(self._layout != self._wall, interpolation="nearest", cmap='pink')
ax = plt.gca()
ax.grid(0)
plt.xticks([])
plt.yticks([])
if title != None:
plt.title(title)
else:
plt.title("The Grid")
plt.text(self._start_state[1], self._start_state[0], r"$\mathbf{S}$", ha='center', va='center')
for e in self._goals:
if e == self._key:
y = np.where(self._layout==e)[0]
x = np.where(self._layout==e)[1]
for i in range(y.shape[0]):
plt.text(x[i], y[i], r"$\mathbf{K}$", ha='center', va='center')
elif e > 0:
y = np.where(self._layout==e)[0]
x = np.where(self._layout==e)[1]
for i in range(y.shape[0]):
plt.text(x[i], y[i], r"$\mathbf{G}$", ha='center', va='center')
y = np.where(self._layout==self._door)[0]
x = np.where(self._layout==self._door)[1]
for i in range(y.shape[0]):
plt.text(x[i], y[i], r"$\mathbf{D}$", ha='center', va='center')
h, w = self._layout.shape
for y in range(h-1):
plt.plot([-0.5, w-0.5], [y+0.5, y+0.5], '-k', lw=2)
for x in range(w-1):
plt.plot([x+0.5, x+0.5], [-0.5, h-0.5], '-k', lw=2)
def get_obs(self):
y, x = self._state
return self.get_obs_at(x, y)
def get_obs_at(self, x, y):
if self._tabular:
return y*self._layout.shape[1] + x
else:
v = self._vision_size
#location = np.clip(-self._layout[y-v:y+v+1,x-v:x+v+1], 0, 1)
#location = np.clip(-self._layout[y-v:y+v+1,x-v:x+v+1], -1, 2)
location = self._layout[y-v:y+v+1,x-v:x+v+1]
return location
def step(self, action, agent_inventory):
item = None
y, x = self._state
if action == 0: # up
new_state = (y - 1, x)
elif action == 1: # right
new_state = (y, x + 1)
elif action == 2: # down
new_state = (y + 1, x)
elif action == 3: # left
new_state = (y, x - 1)
else:
raise ValueError("Invalid action: {} is not 0, 1, 2, or 3.".format(action))
new_y, new_x = new_state
discount = self._discount
if self._layout[new_y, new_x] == self._wall: # a wall
reward = -1
new_state = (y, x)
elif self._layout[new_y, new_x] == self._key: # a key
reward = self._layout[new_y, new_x]
item = 'KEY'
discount = 0.
new_state = self._start_state
elif self._layout[new_y, new_x] == self._door: # a door
reward = 5
if 'KEY' not in agent_inventory:
reward = self._layout[new_y, new_x]
new_state = (y, x)
elif self._layout[new_y, new_x] > 0: # a goal
reward = self._layout[new_y, new_x]
discount = 0.
new_state = self._start_state
else:
distToNearestGoal, nearestGoal = self.distanceToNearestGoal(new_y, new_x)
maxDistance = self.maxDistanceTwoPoints(self._start_state[0], self._start_state[1], nearestGoal[0], nearestGoal[1])
reward = self._layout[nearestGoal[0],nearestGoal[1]]*np.exp(-distToNearestGoal/maxDistance)
if self._noisy:
width = self._layout.shape[1]
reward += 2*np.random.normal(0, width - new_x + new_y)
self._state = new_state
return reward, discount, self.get_obs(), item
tasks = []
tasks.append(Hallway(goal_loc = [(8,1,5), (8,2,5), (8,3,5), (2,1,5), (2,2,5), (2,3,5), (2,12,5), (2,13,5), (2,14,5), (5,4,5), (8,13,5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(5, 4, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 1, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 2, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 3, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 1, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 2, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 3, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 12, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 13, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 14, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 13, 5)], discount=0.98))
for idt,task in enumerate(tasks):
task.plot_grid(title="grid_{}".format(idt) )
class GeneralQ(object):
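    """Tabular agent with expected-SARSA/Q-learning updates and uniform experience replay.

    The update target is the expectation of Q(s', .) under the supplied target policy
    (greedy for Q-learning, one-hot at the behaviour action for SARSA). With double=True
    two value tables are maintained and one of them is updated per step, double-Q style.
    """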
def __init__(self, number_of_states, number_of_actions, initial_state, target_policy, behaviour_policy, double, num_offline_updates=30, step_size=0.1):
self._q = np.zeros((number_of_states, number_of_actions))
self._replayBuffer_A = []
if double:
self._q2 = np.zeros((number_of_states, number_of_actions))
self._replayBuffer_B = []
self._s = initial_state
self._initial_state = initial_state
self._number_of_actions = number_of_actions
self._step_size = step_size
self._behaviour_policy = behaviour_policy
self._target_policy = target_policy
self._double = double
self._num_offline_updates = num_offline_updates
self._last_action = 0
self._inventory = set()
def resetReplayBuffer(self):
self._replayBuffer_A = []
if self._double:
self._replayBuffer_B = []
@property
def q_values(self):
if self._double:
return (self._q + self._q2)/2
else:
return self._q
def resetState(self):
self._s = self._initial_state
def step(self, r, g, s, item, train):
td = None
if item != None:
self._inventory.add(item)
if self._double:
next_action = self._behaviour_policy(self.q_values[s,:], train)
if np.random.random() <= 0.5:
expectation = np.sum(self._target_policy(self._q[s,:], next_action) * self._q2[s,:])
td = self._step_size*(r + g*expectation - self._q[self._s,self._last_action])
if train == True:
self._q[self._s,self._last_action] += self._step_size*(r + g*expectation - self._q[self._s,self._last_action])
#self._q[self._s,self._last_action] += self._step_size*(r + g*self._q2[s,np.argmax(target_policy(self._q[s,:], next_action))] - self._q[self._s,self._last_action])
self._replayBuffer_A.append([self._s,self._last_action,r,g,s,next_action])
for _ in range(self._num_offline_updates):
replay = self._replayBuffer_A[np.random.randint(len(self._replayBuffer_A))]
expectation = np.sum(self._target_policy(self._q[replay[4],:], replay[5]) * self._q2[replay[4],:])
self._q[replay[0],replay[1]] += self._step_size*(replay[2] + replay[3] * expectation - self._q[replay[0],replay[1]])
else:
expectation = np.sum(self._target_policy(self._q2[s,:], next_action) * self._q[s,:])
td = self._step_size*(r + g*expectation - self._q2[self._s,self._last_action])
if train == True:
self._q2[self._s,self._last_action] += self._step_size*(r + g*expectation - self._q2[self._s,self._last_action])
#self._q2[self._s,self._last_action] += self._step_size*(r + g*self._q[s,np.argmax(target_policy(self._q2[s,:], next_action))] - self._q2[self._s,self._last_action])
self._replayBuffer_B.append([self._s,self._last_action,r,g,s,next_action])
for _ in range(self._num_offline_updates):
replay = self._replayBuffer_B[np.random.randint(len(self._replayBuffer_B))]
                        expectation = np.sum(self._target_policy(self._q2[replay[4],:], replay[5]) * self._q[replay[4],:])
                        self._q2[replay[0],replay[1]] += self._step_size*(replay[2] + replay[3] * expectation - self._q2[replay[0],replay[1]])
self._s = s
self._last_action = next_action
return self._last_action, self._inventory, td
else:
next_action = self._behaviour_policy(self._q[s,:], train)
            # Expected update: expectation of Q(s', .) under the target policy (expected SARSA; Q-learning when the target policy is greedy).
expectation = np.sum(self._target_policy(self._q[s,:], next_action) * self._q[s,:])
td = self._step_size*(r + g*expectation - self._q[self._s,self._last_action])
if train == True:
self._q[self._s,self._last_action] += self._step_size*(r + g*expectation - self._q[self._s,self._last_action])
#self._q[self._s,self._last_action] += self._step_size*(r + g*self._q[s,np.argmax(target_policy(self._q[s,:], next_action))] - self._q[self._s,self._last_action])
self._replayBuffer_A.append([self._s,self._last_action,r,g,s,next_action])
for _ in range(self._num_offline_updates):
replay = self._replayBuffer_A[np.random.randint(len(self._replayBuffer_A))]
                    expectation = np.sum(self._target_policy(self._q[replay[4],:], replay[5]) * self._q[replay[4],:])
self._q[replay[0],replay[1]] += self._step_size*(replay[2] + replay[3] * expectation - self._q[replay[0],replay[1]])
self._s = s
self._last_action = next_action
#print(self._inventory)
return self._last_action, self._inventory, td
def Q_target_policy(q, a):
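    # Greedy target policy: a one-hot distribution at argmax(q) (Q-learning target).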
return np.eye(len(q))[np.argmax(q)]
def SARSA_target_policy(q, a):
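    # On-policy target: a one-hot distribution at the action actually taken (SARSA target).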
return np.eye(len(q))[a]
def gen_behaviour_policy(q, train):
#return epsilon_greedy(q, 0.1) if train == True else np.random.choice(np.where(np.max(q) == q)[0])
return epsilon_greedy(q, 0.2) if train == True else epsilon_greedy(q, 0.05)
An agent that uses Neural-Sarsa/DQN to learn action values. The agent should expect an $n \times n$ input, which it should flatten into a vector and pass through a multi-layer perceptron with a single hidden layer of 100 hidden nodes and ReLU activations. Each weight layer should also have a bias. Initialize all weights uniformly at random in $[-0.05, 0.05]$.
NeuralSarsa(number_of_features=(2*vision_size + 1)**2,
number_of_hidden=100,
number_of_actions=4,
initial_state=grid.get_obs(),
step_size=0.01)
DQN(number_of_features=(2*vision_size + 1)**2,
number_of_hidden=100,
number_of_actions=4,
initial_state=grid.get_obs(),
step_size=0.01)
The value of vision_size, $v$, will be either 1 or 2 below. The input vector will be of size $(2v + 1)^2$, corresponding to a square local view of the grid, centered on the agent, of size $(2v + 1) \times (2v + 1)$ (so either $3 \times 3$ or $5 \times 5$).
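As a purely illustrative sketch (the names init_mlp and q_values_mlp below are not part of the agents in this notebook), here is how such a local view could be flattened and passed through the single-hidden-layer MLP described above, in plain numpy with the stated uniform $[-0.05, 0.05]$ initialization; the TensorFlow agents below build the equivalent network with tf.layers.dense.
def init_mlp(num_features, num_hidden=100, num_actions=4, rng=None):
    # All weights and biases initialised uniformly at random in [-0.05, 0.05].
    rng = rng if rng is not None else np.random.RandomState(0)
    u = lambda *shape: rng.uniform(-0.05, 0.05, size=shape)
    return {'W1': u(num_features, num_hidden), 'b1': u(num_hidden),
            'W2': u(num_hidden, num_actions), 'b2': u(num_actions)}
def q_values_mlp(params, obs):
    # Flatten the (2v+1) x (2v+1) local view and apply the ReLU-hidden MLP.
    x = np.reshape(obs, (-1,))
    h = np.maximum(0., x.dot(params['W1']) + params['b1'])
    return h.dot(params['W2']) + params['b2']
# Example with vision_size = 1: a 3x3 local view gives 9 input features and 4 action values.
example_vision_size = 1
example_params = init_mlp((2*example_vision_size + 1)**2)
example_obs = np.zeros((3, 3))  # e.g. an all-empty local patch of the grid
print(q_values_mlp(example_params, example_obs).shape)  # (4,)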
class NEURAL_CONTROLLER_DRIVER(object):
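    """Two-level neural agent for curriculum learning.

    A 'controller' MLP (the teacher) selects which task to train on, and a 'driver' MLP
    (the student) acts inside the gridworld. Both are single-hidden-layer networks trained
    with either DQN or neural-SARSA targets and uniform experience replay.
    """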
# Target Network is the same, as C-step is just C=1
def __init__(self, number_of_features_controller,
number_of_features_driver,
number_of_hidden_controller,
number_of_hidden_driver,
number_of_actions_controller,
number_of_actions_driver,
initial_state_controller,
initial_state_driver,
rl_alg_controller='DQN',
rl_alg_driver='DQN',
num_offline_updates_controller=20,
num_offline_updates_driver=25,
step_size_controller=0.01,
step_size_driver=0.01):
# HMMM?
tf.reset_default_graph()
self._prev_action_driver = 0
self._step_driver = step_size_driver
self._num_features_driver = number_of_features_driver
self._num_action_driver = number_of_actions_driver
self._num_hidden_driver = number_of_hidden_driver
self._initial_state_driver = initial_state_driver
self._s_driver = initial_state_driver
self._s_driver = np.reshape(self._s_driver, (1,-1))
self._times_trained_driver = 0
self._inventory = set()
self._replayBuffer_driver = []
self._num_offline_updates_driver = num_offline_updates_driver
self._rl_alg_driver = rl_alg_driver
self._prev_action_controller = 0
self._step_controller = step_size_controller
self._num_features_controller = number_of_features_controller
self._num_action_controller = number_of_actions_controller
self._num_hidden_controller = number_of_hidden_controller
self._initial_state_controller = initial_state_controller
self._s_controller = initial_state_controller
self._s_controller = np.reshape(self._s_controller, (1,-1))
self._times_trained_controller = 0
self._replayBuffer_controller = []
self._num_offline_updates_controller = num_offline_updates_controller
self._rl_alg_controller = rl_alg_controller
self.name = 'HYPER '+self._rl_alg_controller
        # Initial task-selection probabilities: uniform over the controller's actions (one per task).
        self._probs_controller = np.ones((1, self._num_action_controller))/(self._num_action_controller*1.)
self._times_used = 0.
self.handleTF()
def reset(self):
tf.reset_default_graph()
self.handleTF()
self.resetState_controller()
self.resetReplayBuffer_controller()
        self._probs_controller = np.ones((1, self._num_action_controller))/(self._num_action_controller*1.)
self._times_trained_controller = 0
self._prev_action_controller = 0
self.resetReplayBuffer()
self.resetState()
self._times_trained_driver = 0
self._prev_action_driver = 0
self._inventory = set()
self._times_used = 0
def resetReplayBuffer_controller(self):
self._replayBuffer_controller = []
def resetState_controller(self):
self._s_controller = self._initial_state_controller
self._s_controller = np.reshape(self._s_controller, (1,-1))
def handleTF(self):
self._sess = tf.Session()
#tf.reset_default_graph()
self.rewTensor_controller = tf.placeholder(tf.float64)
self.disTensor_controller = tf.placeholder(tf.float64)
self.nqTensor_controller = tf.placeholder(tf.float64)
self.actionTensor_controller = tf.placeholder(tf.int32)
self.stateTensor_controller = tf.placeholder(tf.float64, shape=(1,self._num_features_controller))
self._dense_1_controller = tf.layers.dense(self.stateTensor_controller,
self._num_hidden_controller, activation=tf.nn.relu,
kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
self._dense_2_controller = tf.layers.dense(self._dense_1_controller,
self._num_action_controller, activation=None,
kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
self._q_controller = tf.reshape(self._dense_2_controller, (self._num_action_controller,))
self._softmx_controller = tf.nn.softmax(self._q_controller)
self._cost_controller = tf.losses.mean_squared_error(self.rewTensor_controller + self.disTensor_controller*self.nqTensor_controller, self._q_controller[self.actionTensor_controller])
self._opt_controller = tf.train.GradientDescentOptimizer(self._step_controller).minimize(self._cost_controller)
self.rewTensor_driver = tf.placeholder(tf.float64)
self.disTensor_driver = tf.placeholder(tf.float64)
self.nqTensor_driver = tf.placeholder(tf.float64)
self.actionTensor_driver = tf.placeholder(tf.int32)
self.stateTensor_driver = tf.placeholder(tf.float64, shape=(1,self._num_features_driver))
self._dense_1_driver = tf.layers.dense(self.stateTensor_driver,
self._num_hidden_driver, activation=tf.nn.relu,
kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
self._dense_2_driver = tf.layers.dense(self._dense_1_driver,
self._num_action_driver, activation=None,
kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
self._q_driver = tf.reshape(self._dense_2_driver, (self._num_action_driver,))
self._cost_driver = tf.losses.mean_squared_error(self.rewTensor_driver+ self.disTensor_driver*self.nqTensor_driver, self._q_driver[self.actionTensor_driver])
self._opt_driver = tf.train.GradientDescentOptimizer(self._step_driver).minimize(self._cost_driver)
# HMMM?
self._sess.run(tf.global_variables_initializer())
def _target_policy_controller(self, q, a):
return np.eye(len(q))[a]
def _behaviour_policy_controller(self, q):
return epsilon_greedy(q, 0.2)# if train == True else np.random.choice(np.where(np.max(q) == q)[0])
def getProbs(self):
# softmax
return self._probs_controller
def q_controller(self, obs):
#print [n.name for n in tf.get_default_graph().as_graph_def().node]
obs = np.reshape(obs,(1,-1))
#print obs
t, probs = self._sess.run([self._q_controller, self._softmx_controller], {self.stateTensor_controller: obs})
return t, probs
def step_controller(self, r, g, s):
self._times_used += 1
#print self._times_used
qvs, probs = self.q_controller(s)
q_nxtState = np.reshape(qvs, (-1,))
self._probs_controller = probs
next_action = self._behaviour_policy_controller(q_nxtState)
if r != None:
if self._rl_alg_controller == 'NEURALSARSA':
target = self._target_policy_controller(q_nxtState, next_action)
target = np.random.choice(np.where(np.max(target) == target)[0])
vob = q_nxtState[target]
#print vob
self._sess.run(self._opt_controller,{
self.nqTensor_controller: vob,
self.rewTensor_controller: r,
self.disTensor_controller: g,
self.actionTensor_controller: self._prev_action_controller,
self.stateTensor_controller: self._s_controller})
self._replayBuffer_controller.append([self._s_controller, self._prev_action_controller, r, g, vob])
for _ in range(self._num_offline_updates_controller):
replay = self._replayBuffer_controller[np.random.randint(len(self._replayBuffer_controller))]
self._sess.run(self._opt_controller,{
self.nqTensor_controller: replay[4],
self.rewTensor_controller: replay[2],
self.disTensor_controller: replay[3],
self.actionTensor_controller: replay[1],
self.stateTensor_controller: replay[0]})
elif self._rl_alg_controller == 'DQN':
# This function should return an action
# Optimiser
vob = np.max(q_nxtState)
self._sess.run(self._opt_controller,{
self.nqTensor_controller: vob,
self.rewTensor_controller: r,
self.disTensor_controller: g,
self.actionTensor_controller: self._prev_action_controller,
self.stateTensor_controller: self._s_controller})
self._replayBuffer_controller.append([self._s_controller, self._prev_action_controller, r, g, vob])
for _ in range(self._num_offline_updates_controller):
replay = self._replayBuffer_controller[np.random.randint(len(self._replayBuffer_controller))]
self._sess.run(self._opt_controller,{
self.nqTensor_controller: replay[4],
self.rewTensor_controller: replay[2],
self.disTensor_controller: replay[3],
self.actionTensor_controller: replay[1],
self.stateTensor_controller: replay[0]})
self._s_controller = np.reshape(s, (1,-1))
self._prev_action_controller = next_action
return next_action
def reset_controller(self):
tf.reset_default_graph()
self.handleTF()
self.resetState_controller()
self.resetReplayBuffer_controller()
        self._probs_controller = np.ones((1, self._num_action_controller))/(self._num_action_controller*1.)
self._times_trained_controller = 0
self._prev_action_controller = 0
# resetReplayBuffer_driver
def resetReplayBuffer(self):
self._replayBuffer_driver = []
# resetState_driver
def resetState(self):
self._s_driver = self._initial_state_driver
self._s_driver = np.reshape(self._s_driver, (1,-1))
def _target_policy_driver(self, q, a):
return np.eye(len(q))[a]
def _behaviour_policy_driver(self, q, train):
#return epsilon_greedy(q, 0.1) if train == True else np.random.choice(np.where(np.max(q) == q)[0])
return epsilon_greedy(q, 0.2) if train == True else epsilon_greedy(q, 0.05)
def q_driver(self, obs):
#print [n.name for n in tf.get_default_graph().as_graph_def().node]
obs = np.reshape(obs,(1,-1))
#print obs
t = self._sess.run(self._q_driver, {self.stateTensor_driver: obs})
return t
# step_driver
def step(self, r, g, s, item, train):
cost = None
if item != None:
self._inventory.add(item)
# This function should return an action
q_nxtState = np.reshape(self.q_driver(s), (-1,))
next_action = self._behaviour_policy_driver(q_nxtState, train)
if self._rl_alg_driver == 'NEURALSARSA':
target = self._target_policy_driver(q_nxtState, next_action)
target = np.random.choice(np.where(np.max(target) == target)[0])
# Optimiser
vob = q_nxtState[target]
# cost = self._sess.run(self._cost_driver,{
# self.nqTensor_driver: vob,
# self.rewTensor_driver: r,
# self.disTensor_driver: g,
# self.actionTensor_driver: self._prev_action_driver,
# self.stateTensor_driver: self._s_driver})
if train == True:
self._sess.run(self._opt_driver,{
self.nqTensor_driver: vob,
self.rewTensor_driver: r,
self.disTensor_driver: g,
self.actionTensor_driver: self._prev_action_driver,
self.stateTensor_driver: self._s_driver})
self._replayBuffer_driver.append([self._s_driver, self._prev_action_driver, r, g, vob])
for _ in range(self._num_offline_updates_driver):
replay = self._replayBuffer_driver[np.random.randint(len(self._replayBuffer_driver))]
self._sess.run(self._opt_driver,{
self.nqTensor_driver: replay[4],
self.rewTensor_driver: replay[2],
self.disTensor_driver: replay[3],
self.actionTensor_driver: replay[1],
self.stateTensor_driver: replay[0]})
elif self._rl_alg_driver == 'DQN':
vob = np.max(q_nxtState)
# cost = self._sess.run(self._cost_driver,{
# self.nqTensor_driver: vob,
# self.rewTensor_driver: r,
# self.disTensor_driver: g,
# self.actionTensor_driver: self._prev_action_driver,
# self.stateTensor_driver: self._s_driver})
if train == True:
self._sess.run(self._opt_driver,{
self.nqTensor_driver: vob,
self.rewTensor_driver: r,
self.disTensor_driver: g,
self.actionTensor_driver: self._prev_action_driver,
self.stateTensor_driver: self._s_driver})
self._replayBuffer_driver.append([self._s_driver, self._prev_action_driver, r, g, vob])
for _ in range(self._num_offline_updates_driver):
replay = self._replayBuffer_driver[np.random.randint(len(self._replayBuffer_driver))]
self._sess.run(self._opt_driver,{
self.nqTensor_driver: replay[4],
self.rewTensor_driver: replay[2],
self.disTensor_driver: replay[3],
self.actionTensor_driver: replay[1],
self.stateTensor_driver: replay[0]})
self._s_driver = np.reshape(s, (1,-1))
self._prev_action_driver = next_action
return next_action, self._inventory, cost
class Random(object):
"""A random agent.
    This agent returns an action in [0, 'number_of_arms'),
uniformly at random. The 'previous_action' argument of 'step'
is ignored.
"""
def __init__(self, number_of_arms):
self._number_of_arms = number_of_arms
self.name = 'random'
self.reset()
def step(self, previous_action, reward):
return np.random.randint(self._number_of_arms)
def getProbs(self):
return np.ones((self._number_of_arms))/self._number_of_arms
def reset(self):
pass
class REINFORCE(object):
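    """REINFORCE task-selection bandit with softmax action preferences and an optional average-reward baseline."""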
def __init__(self, number_of_arms, step_size=0.1, baseline=False):
self._number_of_arms = number_of_arms
self._lr = step_size
self.name = 'reinforce, baseline: {}'.format(baseline)
self._baseline = baseline
self.action_values = np.zeros((2,self._number_of_arms))
self.action_preferences = np.zeros((1,self._number_of_arms))
self.total_reward = 0;
self.number_rewards = 0.
self.reset()
def step(self, previous_action, reward):
if previous_action != None:
self.number_rewards += 1.
self.total_reward += reward
self.action_values[0,previous_action] += reward
self.action_values[1,previous_action] += 1.
self.updatePreferences(previous_action, reward)
# unvisited = np.where(self.action_values[1,:] == 0.)
# if unvisited[0].size > 0:
# return unvisited[0][0]
# else:
# return np.random.choice(np.arange(0,self._number_of_arms),p=self.softmax())
return np.random.choice(np.arange(0,self._number_of_arms),p=self.softmax())
def reset(self):
self.action_values = np.zeros((2,self._number_of_arms))
self.action_preferences = np.zeros((1,self._number_of_arms))
self.number_rewards = 0.
self.total_reward = 0.
def updatePreferences(self, previous_action, reward):
if not self._baseline:
self.action_preferences[0,previous_action]+=self._lr*reward*(1-self.softmax()[previous_action])
for i in range(0,self._number_of_arms):
if i != previous_action:
self.action_preferences[0,i]-=self._lr*reward*self.softmax()[i]
else:
self.action_preferences[0,previous_action]+=self._lr*(reward - self.total_reward/self.number_rewards)*(1-self.softmax()[previous_action])
for i in range(0,self._number_of_arms):
if i != previous_action:
self.action_preferences[0,i]-=self._lr*(reward - self.total_reward/self.number_rewards)*self.softmax()[i]
def softmax(self):
q = np.sum(np.exp(self.action_preferences),axis=1)
t = np.exp(self.action_preferences)/q
return t.flatten()
def getProbs(self):
return self.softmax()
class EXP3(object):
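    """EXP3 adversarial-bandit task selector with exploration/mixing coefficient gamma; each arm is tried once before the EXP3 distribution is used."""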
def __init__(self, number_of_arms, gamma):
self._number_of_arms = number_of_arms
self.name = 'exp3 Gamma: ' + str(gamma)
self.action_values = np.zeros((2,self._number_of_arms))
self.gamma = gamma
self.weights = np.ones((1,self._number_of_arms))
self.time = 0.
self.reset()
def step(self, previous_action, reward):
if previous_action != None:
xhat = np.zeros((1, self._number_of_arms))
xhat[0,previous_action] = reward/self.action_values[0,previous_action]
self.weights = self.weights*np.exp(self.gamma*xhat/self._number_of_arms)
self.action_values[1,previous_action] += 1.
self.action_values[0,:] = (1-self.gamma)*(self.weights)/(np.sum(self.weights)) + self.gamma/self._number_of_arms
action = np.random.choice(self._number_of_arms, p=self.action_values[0,:])
self.time += 1.
unvisited = np.where(self.action_values[1,:] == 0.)
return unvisited[0][0] if unvisited[0].size > 0 else action
def getProbs(self):
return self.action_values[0,:]
def reset(self):
self.action_values = np.zeros((2, self._number_of_arms))
self.weights = np.ones((1, self._number_of_arms))
self.time = 0
return
class NS_DQN(object):
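    """Single-network neural agent (DQN or neural-SARSA targets) with uniform experience replay.

    With teacher_student=True it acts as the task-selecting teacher; otherwise it is the
    gridworld student acting on local observations.
    """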
# Target Network is the same, as C-step is just C=1
def __init__(self, teacher_student, number_of_features, number_of_hidden, number_of_actions, initial_state, rl_alg='DQN', num_offline_updates=30, step_size=0.01):
tf.reset_default_graph()
self._prev_action = 0
self._step = step_size
self._num_features = number_of_features
self._num_action = number_of_actions
self._num_hidden = number_of_hidden
self._initial_state = initial_state
self._s = initial_state
self._s = np.reshape(self._s, (1,-1))
self._times_trained = 0
self._replayBuffer = []
self._num_offline_updates = num_offline_updates
self._rl_alg = rl_alg
self._teacher_student = teacher_student
self._probs = np.ones((1, self._num_action))/(self._num_action*1.)
self._inventory = set()
if self._teacher_student == True:
self.name = 'HYPER '+self._rl_alg
else:
self.name = self._rl_alg
self.handleTF()
def resetReplayBuffer(self):
self._replayBuffer = []
def resetState(self):
self._s = self._initial_state
self._s = np.reshape(self._s, (1,-1))
def handleTF(self):
self._sess = tf.Session()
#tf.reset_default_graph()
self.rewTensor = tf.placeholder(tf.float64)
self.disTensor = tf.placeholder(tf.float64)
self.nqTensor = tf.placeholder(tf.float64)
self.actionTensor = tf.placeholder(tf.int32)
self.stateTensor = tf.placeholder(tf.float64, shape=(1,self._num_features))
self._dense_1 = tf.layers.dense(self.stateTensor,
self._num_hidden, activation=tf.nn.relu,
kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
self._dense_2 = tf.layers.dense(self._dense_1,
self._num_action, activation=None,
kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
self._q = tf.reshape(self._dense_2, (self._num_action,))
self._softmx = tf.nn.softmax(self._q)
self._cost = tf.losses.mean_squared_error(self.rewTensor + self.disTensor*self.nqTensor, self._q[self.actionTensor])
self._opt = tf.train.GradientDescentOptimizer(self._step).minimize(self._cost)
# HMMM?
self._sess.run(tf.global_variables_initializer())
def _target_policy(self, q, a):
return np.eye(len(q))[a]
def _behaviour_policy(self, q, train):
if self._teacher_student == True:
return epsilon_greedy(q, 0.2)#if train == True else np.random.choice(np.where(np.max(q) == q)[0])
else:
return epsilon_greedy(q, 0.2) if train == True else epsilon_greedy(q, 0.05)
def getProbs(self):
# softmax
return self._probs
def q_noProbs(self, obs):
obs = np.reshape(obs,(1,-1))
t = self._sess.run(self._q, {self.stateTensor: obs})
return t
def q(self, obs):
obs = np.reshape(obs,(1,-1))
t, probs = self._sess.run([self._q, self._softmx], {self.stateTensor: obs})
return t, probs
def step(self, r, g, s, item=None, train=True):
cost = None
if item != None:
self._inventory.add(item)
qvs, probs = self.q(s)
q_nxtState = np.reshape(qvs, (-1,))
self._probs = probs
next_action = self._behaviour_policy(q_nxtState, train)
if r != None:
if self._rl_alg == 'NEURALSARSA':
target = self._target_policy(q_nxtState, next_action)
target = np.random.choice(np.where(np.max(target) == target)[0])
vob = q_nxtState[target]
if train == True:
self._sess.run(self._opt,{
self.nqTensor: vob,
self.rewTensor: r,
self.disTensor: g,
self.actionTensor: self._prev_action,
self.stateTensor: self._s})
self._replayBuffer.append([self._s, self._prev_action, r, g, vob])
for _ in range(self._num_offline_updates):
replay = self._replayBuffer[np.random.randint(len(self._replayBuffer))]
self._sess.run(self._opt,{
self.nqTensor: replay[4],
self.rewTensor: replay[2],
self.disTensor: replay[3],
self.actionTensor: replay[1],
self.stateTensor: replay[0]})
elif self._rl_alg == 'DQN':
vob = np.max(q_nxtState)
if train == True:
self._sess.run(self._opt,{
self.nqTensor: vob,
self.rewTensor: r,
self.disTensor: g,
self.actionTensor: self._prev_action,
self.stateTensor: self._s})
self._replayBuffer.append([self._s, self._prev_action, r, g, vob])
for _ in range(self._num_offline_updates):
replay = self._replayBuffer[np.random.randint(len(self._replayBuffer))]
self._sess.run(self._opt,{
self.nqTensor: replay[4],
self.rewTensor: replay[2],
self.disTensor: replay[3],
self.actionTensor: replay[1],
self.stateTensor: replay[0]})
self._s = np.reshape(s, (1,-1))
self._prev_action = next_action
return next_action, self._inventory, cost
def reset(self):
tf.reset_default_graph()
self.handleTF()
self.resetState()
self.resetReplayBuffer()
self._probs = np.ones((1, self._num_action))/(self._num_action*1.)
self._times_trained = 0
self._prev_action = 0
self._inventory = set()
class TaskSelector(object):
"""An adversarial multi-armed Task bandit."""
def __init__(self, rl_agent, tasks, reward_signal):
self._unscaled_reward_history = []
self._rl_agent = rl_agent
self._tasks = tasks
self._reward_signal = reward_signal
self._tasks_buffer = np.zeros((5,len(tasks)))
def resetReplayBuffer(self):
self._rl_agent.resetReplayBuffer()
def step(self, action_task_id):
if self._reward_signal == 'PG':
run_step(self._tasks[action_task_id], self._rl_agent, True)
reward_after, reward_steps_after = run_step(self._tasks[action_task_id], self._rl_agent, False)
elif self._reward_signal == 'GPG':
pass
elif self._reward_signal == 'SPG':
run_step(self._tasks[action_task_id], self._rl_agent, True)
reward_after, reward_steps_after = run_step(self._tasks[action_task_id], self._rl_agent, False)
elif self._reward_signal == 'TPG':
run_step(self._tasks[action_task_id], self._rl_agent, True)
reward_after, reward_steps_after = run_step(self._tasks[-1], self._rl_agent, False)
elif self._reward_signal == 'MPG':
uniform_sampled_task_id = np.random.choice(len(self._tasks))
run_step(self._tasks[action_task_id], self._rl_agent, True)
reward_after, reward_steps_after = run_step(self._tasks[uniform_sampled_task_id], self._rl_agent, False)
elif self._reward_signal == 'VCG':
pass
elif self._reward_signal == 'GVCG':
pass
elif self._reward_signal == 'L2G':
pass
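        # Learning-progress signal: the slope of a linear fit to the five most recent
        # evaluation rewards on the selected task, later rescaled to [-1, 1] using the
        # 20th/80th percentiles of the reward history.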
self._tasks_buffer[:,action_task_id] = np.roll(self._tasks_buffer[:,action_task_id], 1)
self._tasks_buffer[0,action_task_id] = reward_after
X = np.arange(self._tasks_buffer.shape[0])
slope, _, _, _, _ = stats.linregress(X, self._tasks_buffer[:,action_task_id])
rhat = slope
self._unscaled_reward_history.append(rhat)
temp_history = np.array(sorted(self._unscaled_reward_history))
p_20 = np.percentile(temp_history, 20)
p_80 = np.percentile(temp_history, 80)
        if action_task_id < 0 or action_task_id >= len(self._tasks):
            raise ValueError('Action {} is out of bounds for a '
                             '{}-armed bandit'.format(action_task_id, len(self._tasks)))
r = None
if rhat <= p_20:
r = -1.
elif rhat > p_80:
r = 1.
else:
r = 2.0 * (rhat - p_20)/(p_80 - p_20) - 1.
#print reward_steps_after
        # Perhaps plot the variance or something else, because the train==False evaluation distorts this plot
return r, reward_after, np.reshape(self._tasks_buffer.T,(-1,))
def plot_values(values, colormap='pink', vmin=None, vmax=None):
plt.imshow(values, interpolation="nearest", cmap=colormap, vmin=vmin, vmax=vmax)
plt.yticks([])
plt.xticks([])
plt.colorbar(ticks=[vmin, vmax])
def plot_action_values(action_values, title, vmin=None, vmax=None):
q = action_values
fig = plt.figure(figsize=(10, 10))
fig.subplots_adjust(wspace=0.3, hspace=0.3)
vmin = np.min(action_values)
vmax = np.max(action_values)
#print vmin, vmax
dif = vmax - vmin
for a in [0, 1, 2, 3]:
plt.subplot(3, 3, map_from_action_to_subplot(a))
plot_values(q[..., a], vmin=vmin - 0.05*dif, vmax=vmax + 0.05*dif)
action_name = map_from_action_to_name(a)
plt.title(r"$q(s, \mathrm{" + action_name + r"})$")
plt.subplot(3, 3, 5)
v = 0.9 * np.max(q, axis=-1) + 0.1 * np.mean(q, axis=-1)
plot_values(v, colormap='summer', vmin=vmin, vmax=vmax)
plt.title(r"$v(s), \mathrm{" + title + r"}$")
# plt.savefig('./action_values_{}'.format(title))
# plt.close()
def plot_greedy_policy(grid, title, q):
action_names = [r"$\uparrow$",r"$\rightarrow$", r"$\downarrow$", r"$\leftarrow$"]
greedy_actions = np.argmax(q, axis=2)
grid.plot_grid(title)
    # plt.hold('on')  # not needed: newer matplotlib versions always hold, and plt.hold has been removed
for i in range(grid._layout.shape[0]):
for j in range(grid._layout.shape[1]):
action_name = action_names[greedy_actions[i,j]]
plt.text(j, i, action_name, ha='center', va='center')
def run_experiment(bandit, algs, tasks, number_of_steps_of_selecting_tasks, repetitions, vision_size, tabular, agent_type_driver, hidden_units, step_size, reward_signal):
"""Run multiple repetitions of a bandit experiment."""
reward_dict = {}
reward_delta_dict = {}
action_dict = {}
prob_dict = {}
entropy_dict = {}
for alg in algs:
print('Running:', alg.name)
reward_dict[alg.name] = []
reward_delta_dict[alg.name] = []
action_dict[alg.name] = []
prob_dict[alg.name] = []
entropy_dict[alg.name] = []
rl_agent = None
qs = None
for qq in range(repetitions):
print('Rep:', qq)
if isinstance(alg, NEURAL_CONTROLLER_DRIVER):
alg.reset()
bandit = TaskSelector(alg, tasks, reward_signal)
reward_dict[alg.name].append([0.])
reward_delta_dict[alg.name].append([])
action_dict[alg.name].append([])
prob_dict[alg.name].append([])
entropy_dict[alg.name].append([])
action = None
reward = None
prob = None
entropy = None
reward_delta = None
capability = alg._initial_state_controller
for i in range(number_of_steps_of_selecting_tasks):
try:
action = alg.step_controller(reward, 0.98, capability)
prob = alg.getProbs()
entropy = -1.0 * np.sum(prob * np.log(prob))
except:
raise ValueError(
"The step function of algorithm `{}` failed.\
Perhaps you have a bug, such as a typo.\
Or, perhaps your value estimates or policy has diverged.\
(E.g., internal quantities may have become NaNs.)\
Try adding print statements to see if you can find a bug.".format(alg.name))
reward, reward_from_environment, capability = bandit.step(action)
bandit.resetReplayBuffer()
reward_dict[alg.name][-1].append(reward_from_environment+reward_dict[alg.name][-1][-1])
reward_delta_dict[alg.name][-1].append(reward)
action_dict[alg.name][-1].append(action)
prob_dict[alg.name][-1].append(prob.copy())
entropy_dict[alg.name][-1].append(entropy)
h, w = tasks[-1]._layout.shape
obs = np.array([[tasks[-1].get_obs_at(x, y) for x in range(2, w-2)] for y in range(2, h-2)])
if qs is not None:
qs += np.array([[[alg.q_driver(o)[a] for a in range(4)] if o[vision_size,vision_size] == 0 else np.zeros((4,)) for o in ob] for ob in obs])
else:
qs = np.array([[[alg.q_driver(o)[a] for a in range(4)] if o[vision_size,vision_size] == 0 else np.zeros((4,)) for o in ob] for ob in obs])
else:
alg.reset()
rl_agent = None
if agent_type_driver == 'NEURALSARSA' or agent_type_driver == 'DQN':
rl_agent = NS_DQN(teacher_student=False,
number_of_features=(2*vision_size + 1)**2,
number_of_hidden=hidden_units,
number_of_actions=4,
initial_state=tasks[0].get_obs(),
rl_alg=agent_type_driver,
step_size=step_size)
elif agent_type_driver == 'Q':
rl_agent = GeneralQ(number_of_states=tasks[0]._layout.size,
number_of_actions=4,
initial_state=tasks[0].get_obs(),
target_policy=Q_target_policy,
behaviour_policy=gen_behaviour_policy,
double=True)
elif agent_type_driver == 'SARSA':
rl_agent = GeneralQ(number_of_states=tasks[0]._layout.size,
number_of_actions=4,
initial_state=tasks[0].get_obs(),
target_policy=SARSA_target_policy,
behaviour_policy=gen_behaviour_policy,
double=True)
bandit = TaskSelector(rl_agent, tasks, reward_signal)
reward_dict[alg.name].append([0.])
reward_delta_dict[alg.name].append([])
action_dict[alg.name].append([])
prob_dict[alg.name].append([])
entropy_dict[alg.name].append([])
action = None
reward = None
prob = None
entropy = None
reward_delta = None
capability = None
if 'HYPER' in alg.name:
capability = alg._initial_state
for i in range(number_of_steps_of_selecting_tasks):
try:
# This is for when the teacher is neural and student not
if 'HYPER' in alg.name:
action, _, _ = alg.step(reward, 0.98, capability)
else:
action = alg.step(action, reward)
prob = alg.getProbs()
entropy = -1.*np.sum(prob*np.log(prob))
except:
raise ValueError(
"The step function of algorithm `{}` failed.\
Perhaps you have a bug, such as a typo.\
Or, perhaps your value estimates or policy has diverged.\
(E.g., internal quantities may have become NaNs.)\
Try adding print statements to see if you can find a bug.".format(alg.name))
reward, reward_from_environment, capability = bandit.step(action)
#print reward_from_environment
bandit.resetReplayBuffer()
reward_dict[alg.name][-1].append(reward_from_environment+reward_dict[alg.name][-1][-1])
#print reward_dict[alg.name][-1]
reward_delta_dict[alg.name][-1].append(reward)
action_dict[alg.name][-1].append(action)
prob_dict[alg.name][-1].append(prob.copy())
entropy_dict[alg.name][-1].append(entropy)
if agent_type_driver == 'NEURALSARSA' or agent_type_driver == 'DQN':
h, w = tasks[-1]._layout.shape
obs = np.array([[tasks[-1].get_obs_at(x, y) for x in range(2, w-2)] for y in range(2, h-2)])
if qs is not None:
qs += np.array([[[rl_agent.q_noProbs(o)[a] for a in range(4)] if o[vision_size,vision_size] == 0 else np.zeros((4,)) for o in ob] for ob in obs])
else:
qs = np.array([[[rl_agent.q_noProbs(o)[a] for a in range(4)] if o[vision_size,vision_size] == 0 else np.zeros((4,)) for o in ob] for ob in obs])
elif agent_type_driver == 'Q' or agent_type_driver == 'SARSA':
if qs is not None:
qs += rl_agent.q_values.reshape(tasks[-1]._layout.shape + (4,))
else:
qs = rl_agent.q_values.reshape(tasks[-1]._layout.shape + (4,))
#print('')
qs /= repetitions
plot_action_values(qs, alg.name)
return reward_dict, reward_delta_dict, action_dict, prob_dict, entropy_dict
def train_task_agents(agents, number_of_arms, number_of_steps_of_selecting_tasks, tasks, reward_signal, repetitions=1, vision_size=1, tabular=False, agent_type_driver='norm', hidden_units=100, step_size=0.01):
bandit = None
reward_dict, reward_delta_dict, action_dict, prob_dict, entropy_dict = run_experiment(bandit, agents, tasks, number_of_steps_of_selecting_tasks, repetitions, vision_size, tabular, agent_type_driver, hidden_units, step_size, reward_signal)
smoothed_rewards = {}
smoothed_rewards_stds = {}
smoothed_reward_deltas = {}
smoothed_reward_deltas_stds = {}
smoothed_actions = {}
smoothed_probs = {}
smoothed_entropies = {}
smoothed_entropies_stds = {}
agent_set = set()
for agent, rewards in reward_dict.items():
agent_set.add(agent)
smoothed_rewards[agent] = (np.sum(np.array([np.array(x) for x in rewards]), axis=0)).T
smoothed_rewards_stds[agent] = (np.std(np.array([np.array(x) for x in rewards]), axis=0)).T
for agent, rewards in reward_delta_dict.items():
agent_set.add(agent)
smoothed_reward_deltas[agent] = (np.sum(np.array([np.array(x) for x in rewards]), axis=0)).T
smoothed_reward_deltas_stds[agent] = (np.std(np.array([np.array(x) for x in rewards]), axis=0)).T
for agent, probs in prob_dict.items():
smoothed_probs[agent] = (np.sum(np.array([np.array(x) for x in probs]), axis=0)).T
for agent, entropies in entropy_dict.items():
smoothed_entropies[agent] = (np.sum(np.array([np.array(x) for x in entropies]), axis=0)).T
smoothed_entropies_stds[agent] = (np.std(np.array([np.array(x) for x in entropies]), axis=0)).T
for agent in agent_set:
smoothed_probs[agent] /= repetitions
plt.figure(figsize=(22,20))
        plt.imshow(smoothed_probs[agent], interpolation='none')
plt.title('Teacher: {}, Student: {}, Reward Signal: {}'.format(agent, agent_type_driver, reward_signal))
plt.xlabel('Time')
plt.ylabel('Task')
# plt.savefig('./'+agent + ', Reward Signal: {}; {}'.format(reward_signal, agent_type_driver_driver))
# plt.close()
plt.figure(figsize=(12,12))
plt.title('Cumulative Average Reward, Student: {}, Reward Signal: {}'.format(agent_type_driver, reward_signal))
plt.ylabel('Reward')
plt.xlabel('Time')
for agent in agent_set:
smoothed_rewards[agent] /= repetitions
plot = plt.plot(smoothed_rewards[agent], label=agent)
plt.fill_between(np.arange(smoothed_rewards[agent].shape[0]), smoothed_rewards[agent]-smoothed_rewards_stds[agent], smoothed_rewards[agent]+smoothed_rewards_stds[agent], alpha=0.15, facecolor=plot[-1].get_color())
plt.legend(loc='upper right')
# plt.savefig('./Average Reward, \t Reward Signal: {}; {}'.format(reward_signal, agent_type_driver))
# plt.close()
plt.figure(figsize=(12,12))
plt.title('Delta Average Reward, Student: {}, Reward Signal: {}'.format(agent_type_driver, reward_signal))
plt.ylabel('Delta')
plt.xlabel('Time')
for agent in agent_set:
smoothed_reward_deltas[agent] /= repetitions
plt.plot(smoothed_reward_deltas[agent], label=agent)
plt.legend(loc='upper right')
# plt.savefig('./Delta Reward, \t Reward Signal: {}; {}'.format(reward_signal, agent_type_driver))
# plt.close()
plt.figure(figsize=(12,12))
plt.title('Entropy, Student: {}, Reward Signal: {}'.format(agent_type_driver, reward_signal))
plt.ylabel('Policy Entropy')
plt.xlabel('Time')
for agent in agent_set:
smoothed_entropies[agent] /= repetitions
plot = plt.plot(smoothed_entropies[agent], label=agent)
plt.fill_between(np.arange(smoothed_entropies[agent].shape[0]), smoothed_entropies[agent]-smoothed_entropies_stds[agent], smoothed_entropies[agent]+smoothed_entropies_stds[agent], alpha=0.15, facecolor=plot[-1].get_color())
plt.legend(loc='upper right')
# plt.savefig('./Maximum Likelihood, \t Reward Signal: {}; {}'.format(reward_signal, agent_type_driver))
# plt.close()
def run_step(env, agent, train):
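    # Roll the agent through env for a budget of env.distanceToGoal() steps, ending early
    # when a key or goal terminates the episode (discount == 0).
    # Returns the total reward and the per-step reward.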
env.resetState()
agent.resetState()
number_of_steps = env.distanceToGoal()
try:
action = agent.initial_action()
agent_inventory = agent._inventory
except AttributeError:
action = 0
agent_inventory = agent._inventory
steps_completed = 0
total_reward = 0.
while steps_completed != number_of_steps:
reward, discount, next_state, item = env.step(action, agent_inventory)
if item != None:
agent._inventory.add(item)
total_reward += reward
        # Don't remove the key when train==True, otherwise the reward can't be obtained when train==False, which is where the reward is recorded
if reward == 100 and train == False:
agent._inventory.remove('KEY')
if discount == 0:
return total_reward, total_reward/steps_completed
action, agent_inventory, _ = agent.step(reward, discount, next_state, item, train)
steps_completed += 1
mean_reward = total_reward/number_of_steps
return total_reward, mean_reward
def run_episode(env, agent, number_of_episodes, train):
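    # Legacy helper written for an earlier single-task interface (env.step(action), agent.step(r, g, s, train));
    # it is not used in the task-selection experiments below.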
    # Mean reward across all the episodes (i.e. across all steps)
    mean_reward = 0.
    # Mean duration per episode
    mean_duration = 0.
    # List of (total reward per episode)/(duration)
signal_per_episode = np.zeros((1, number_of_episodes))
reward_per_episode = np.zeros((1, number_of_episodes))
duration_per_episode = np.zeros((1, number_of_episodes))
try:
action = agent.initial_action()
except AttributeError:
action = 0
episodes_completed = 0
total_reward_per_episode = 0.
start_time = time.time()
i = 0.
while episodes_completed != number_of_episodes:
reward, discount, next_state = env.step(action)
total_reward_per_episode += reward
if discount == 0:
duration = time.time() - start_time
signal_per_episode[0,episodes_completed] = (total_reward_per_episode/duration)
reward_per_episode[0,episodes_completed] = (total_reward_per_episode)
duration_per_episode[0,episodes_completed] = (duration)
episodes_completed += 1
mean_duration += (duration - mean_duration)/(episodes_completed)
start_time = time.time()
total_reward_per_episode = 0.
action = agent.step(reward, discount, next_state, train)
mean_reward += (reward - mean_reward)/(i + 1.)
i += 1.
mean_signal = np.mean(signal_per_episode)
total_reward = np.sum(reward_per_episode)
total_duration = np.sum(duration_per_episode)
return total_reward, total_duration
map_from_action_to_subplot = lambda a: (2, 6, 8, 4)[a]
map_from_action_to_name = lambda a: ("up", "right", "down", "left")[a]
def epsilon_greedy(q_values, epsilon):
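    # With probability (1 - epsilon) take the greedy action, otherwise a uniformly random one.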
if epsilon < np.random.random():
return np.argmax(q_values)
else:
return np.random.randint(np.array(q_values).shape[-1])
number_of_steps_of_selecting_tasks = 800
reps = 5
reward_signals=['MPG','SPG']
drivers = ['SARSA','Q']
hidden_units_controller_net = 100
hidden_units_driver_net = 100
vision_size = 1
tabular_grid = False
step_size = 0.01
# goal_loc entries have format (row, col, value); a value of 10 places a key, other positive values are goal rewards
tasks = []
#tasks.append(Hallway(goal_loc = [(8,2,-3), (2,2,5), (2,13,5), (8,13,100)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(8,1,5), (8,2,10), (8,3,5), (2,1,5), (2,2,5), (2,3,5), (2,12,5), (2,13,5), (2,14,5), (5,4,5), (8,13,100)], discount=0.98))
for task in tasks:
task.plot_grid()
# Intrinsically Motivated Curriculum Learning
number_of_arms_tasks = len(tasks)
agents = [
Random(number_of_arms_tasks),
]
for reward_signal in reward_signals:
for driver in drivers:
train_task_agents(agents,
number_of_arms_tasks,
number_of_steps_of_selecting_tasks,
tasks,
reward_signal,
reps,
vision_size,
tabular_grid,
driver,
hidden_units_driver_net)
vision_size = 1
tabular_grid = False
step_size = 0.01
# goal_loc entries have format (row, col, value); a value of 10 places a key, other positive values are goal rewards
tasks = []
tasks.append(Hallway(goal_loc = [(5, 4, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 1, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 2, 10)], discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 3, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 1, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 2, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 3, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 12, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 13, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 14, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 13, 100)], discount=0.98))
for task in tasks:
task.plot_grid()
# Intrinsically Motivated Curriculum Learning
number_of_arms_tasks = len(tasks)
agents = [
EXP3(number_of_arms_tasks, 0.2),
REINFORCE(number_of_arms_tasks, baseline=False),
REINFORCE(number_of_arms_tasks, baseline=True),
NS_DQN(teacher_student=True,
number_of_features=number_of_arms_tasks*5,
number_of_hidden=hidden_units_controller_net,
number_of_actions=number_of_arms_tasks,
initial_state=np.zeros((1,number_of_arms_tasks*5)),
rl_alg='DQN',
step_size=step_size),
NS_DQN(teacher_student=True,
number_of_features=number_of_arms_tasks*5,
number_of_hidden=hidden_units_controller_net,
number_of_actions=number_of_arms_tasks,
initial_state=np.zeros((1,number_of_arms_tasks*5)),
rl_alg='NEURALSARSA',
step_size=step_size),
Random(number_of_arms_tasks),
]
for reward_signal in reward_signals:
for driver in drivers:
train_task_agents(agents,
number_of_arms_tasks,
number_of_steps_of_selecting_tasks,
tasks,
reward_signal,
reps,
vision_size,
tabular_grid,
driver,
hidden_units_driver_net)
number_of_steps_of_selecting_tasks = 500
reps = 1
# reward_signals=['MPG','SPG']
# drivers = ['NEURALSARSA','DQN']
reward_signals=['MPG','SPG']
hidden_units_controller_net = 100
hidden_units_driver_net = 100
vision_size = 1
tabular_grid = False
step_size_controller = 0.01
step_size_driver = 0.01
# goal_loc entries have format (row, col, value); a value of 10 places a key, other positive values are goal rewards
tasks = []
tasks.append(Hallway(goal_loc = [(5, 4, 5)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 1, 5)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 2, 10)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 3, 5)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 1, 5)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 2, 5)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 3, 5)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 12, 5)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 13, 5)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 14, 5)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 13, 100)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))
for task in tasks:
task.plot_grid()
# Intrinsically Motivated Curriculum Learning
number_of_arms_tasks = len(tasks)
agents = [
NEURAL_CONTROLLER_DRIVER(number_of_arms_tasks*5,
(2*vision_size + 1)**2,
hidden_units_controller_net,
hidden_units_driver_net,
number_of_arms_tasks,
4,
np.zeros((1,number_of_arms_tasks*5)),
tasks[0].get_obs(),
'NEURALSARSA',
'DQN',
num_offline_updates_controller=25,
num_offline_updates_driver=25,
step_size_controller=step_size_controller,
step_size_driver=step_size_driver),
NEURAL_CONTROLLER_DRIVER(number_of_arms_tasks*5,
(2*vision_size + 1)**2,
hidden_units_controller_net,
hidden_units_driver_net,
number_of_arms_tasks,
4,
np.zeros((1,number_of_arms_tasks*5)),
tasks[0].get_obs(),
'DQN',
'DQN',
num_offline_updates_controller=25,
num_offline_updates_driver=25,
step_size_controller=step_size_controller,
step_size_driver=step_size_driver),
NEURAL_CONTROLLER_DRIVER(number_of_arms_tasks*5,
(2*vision_size + 1)**2,
hidden_units_controller_net,
hidden_units_driver_net,
number_of_arms_tasks,
4,
np.zeros((1,number_of_arms_tasks*5)),
tasks[0].get_obs(),
'DQN',
'NEURALSARSA',
num_offline_updates_controller=25,
num_offline_updates_driver=25,
step_size_controller=step_size_controller,
step_size_driver=step_size_driver),
NEURAL_CONTROLLER_DRIVER(number_of_arms_tasks*5,
(2*vision_size + 1)**2,
hidden_units_controller_net,
hidden_units_driver_net,
number_of_arms_tasks,
4,
np.zeros((1,number_of_arms_tasks*5)),
tasks[0].get_obs(),
'NEURALSARSA',
'NEURALSARSA',
num_offline_updates_controller=25,
num_offline_updates_driver=25,
step_size_controller=step_size_controller,
step_size_driver=step_size_driver),
# Random(number_of_arms_tasks),
]
for reward_signal in reward_signals:
train_task_agents(agents,
number_of_arms_tasks,
number_of_steps_of_selecting_tasks,
tasks,
reward_signal,
reps,
vision_size,
tabular_grid)
# agents = [
# Random(number_of_arms_tasks),
# ]
# for reward_signal in reward_signals:
# for driver in drivers:
# train_task_agents(agents,
# number_of_arms_tasks,
# number_of_steps_of_selecting_tasks,
# tasks,
# reward_signal,
# reps,
# vision_size,
# tabular_grid,
# driver,
# hidden_units_driver_net)